1 /* 2 * Copyright (C) 2009 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.common.base; 18 19 import static com.google.common.base.Preconditions.checkArgument; 20 import static com.google.common.base.Preconditions.checkNotNull; 21 22 import com.google.common.annotations.Beta; 23 import com.google.common.annotations.GwtCompatible; 24 25 import java.util.ArrayList; 26 import java.util.Collections; 27 import java.util.Iterator; 28 import java.util.LinkedHashMap; 29 import java.util.List; 30 import java.util.Map; 31 32 import javax.annotation.CheckReturnValue; 33 34 /** 35 * Extracts non-overlapping substrings from an input string, typically by 36 * recognizing appearances of a <i>separator</i> sequence. This separator can be 37 * specified as a single {@linkplain #on(char) character}, fixed {@linkplain 38 * #on(String) string}, {@linkplain #onPattern regular expression} or {@link 39 * #on(CharMatcher) CharMatcher} instance. Or, instead of using a separator at 40 * all, a splitter can extract adjacent substrings of a given {@linkplain 41 * #fixedLength fixed length}. 42 * 43 * <p>For example, this expression: <pre> {@code 44 * 45 * Splitter.on(',').split("foo,bar,qux")}</pre> 46 * 47 * ... produces an {@code Iterable} containing {@code "foo"}, {@code "bar"} and 48 * {@code "qux"}, in that order. 49 * 50 * <p>By default, {@code Splitter}'s behavior is simplistic and unassuming. The 51 * following expression: <pre> {@code 52 * 53 * Splitter.on(',').split(" foo,,, bar ,")}</pre> 54 * 55 * ... yields the substrings {@code [" foo", "", "", " bar ", ""]}. If this 56 * is not the desired behavior, use configuration methods to obtain a <i>new</i> 57 * splitter instance with modified behavior: <pre> {@code 58 * 59 * private static final Splitter MY_SPLITTER = Splitter.on(',') 60 * .trimResults() 61 * .omitEmptyStrings();}</pre> 62 * 63 * <p>Now {@code MY_SPLITTER.split("foo,,, bar ,")} returns just {@code ["foo", 64 * "bar"]}. Note that the order in which these configuration methods are called 65 * is never significant. 66 * 67 * <p><b>Warning:</b> Splitter instances are immutable. Invoking a configuration 68 * method has no effect on the receiving instance; you must store and use the 69 * new splitter instance it returns instead. <pre> {@code 70 * 71 * // Do NOT do this 72 * Splitter splitter = Splitter.on('/'); 73 * splitter.trimResults(); // does nothing! 74 * return splitter.split("wrong / wrong / wrong");}</pre> 75 * 76 * <p>For separator-based splitters that do not use {@code omitEmptyStrings}, an 77 * input string containing {@code n} occurrences of the separator naturally 78 * yields an iterable of size {@code n + 1}. So if the separator does not occur 79 * anywhere in the input, a single substring is returned containing the entire 80 * input. Consequently, all splitters split the empty string to {@code [""]} 81 * (note: even fixed-length splitters). 82 * 83 * <p>Splitter instances are thread-safe immutable, and are therefore safe to 84 * store as {@code static final} constants. 85 * 86 * <p>The {@link Joiner} class provides the inverse operation to splitting, but 87 * note that a round-trip between the two should be assumed to be lossy. 88 * 89 * <p>See the Guava User Guide article on <a href= 90 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#Splitter"> 91 * {@code Splitter}</a>. 92 * 93 * @author Julien Silland 94 * @author Jesse Wilson 95 * @author Kevin Bourrillion 96 * @author Louis Wasserman 97 * @since 1.0 98 */ 99 @GwtCompatible(emulated = true) 100 public final class Splitter { 101 private final CharMatcher trimmer; 102 private final boolean omitEmptyStrings; 103 private final Strategy strategy; 104 private final int limit; 105 106 private Splitter(Strategy strategy) { 107 this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE); 108 } 109 110 private Splitter(Strategy strategy, boolean omitEmptyStrings, 111 CharMatcher trimmer, int limit) { 112 this.strategy = strategy; 113 this.omitEmptyStrings = omitEmptyStrings; 114 this.trimmer = trimmer; 115 this.limit = limit; 116 } 117 118 /** 119 * Returns a splitter that uses the given single-character separator. For 120 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable 121 * containing {@code ["foo", "", "bar"]}. 122 * 123 * @param separator the character to recognize as a separator 124 * @return a splitter, with default settings, that recognizes that separator 125 */ 126 public static Splitter on(char separator) { 127 return on(CharMatcher.is(separator)); 128 } 129 130 /** 131 * Returns a splitter that considers any single character matched by the 132 * given {@code CharMatcher} to be a separator. For example, {@code 133 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an 134 * iterable containing {@code ["foo", "", "bar", "quux"]}. 135 * 136 * @param separatorMatcher a {@link CharMatcher} that determines whether a 137 * character is a separator 138 * @return a splitter, with default settings, that uses this matcher 139 */ 140 public static Splitter on(final CharMatcher separatorMatcher) { 141 checkNotNull(separatorMatcher); 142 143 return new Splitter(new Strategy() { 144 @Override public SplittingIterator iterator( 145 Splitter splitter, final CharSequence toSplit) { 146 return new SplittingIterator(splitter, toSplit) { 147 @Override int separatorStart(int start) { 148 return separatorMatcher.indexIn(toSplit, start); 149 } 150 151 @Override int separatorEnd(int separatorPosition) { 152 return separatorPosition + 1; 153 } 154 }; 155 } 156 }); 157 } 158 159 /** 160 * Returns a splitter that uses the given fixed string as a separator. For 161 * example, {@code Splitter.on(", ").split("foo, bar,baz")} returns an 162 * iterable containing {@code ["foo", "bar,baz"]}. 163 * 164 * @param separator the literal, nonempty string to recognize as a separator 165 * @return a splitter, with default settings, that recognizes that separator 166 */ 167 public static Splitter on(final String separator) { 168 checkArgument(separator.length() != 0, 169 "The separator may not be the empty string."); 170 171 return new Splitter(new Strategy() { 172 @Override public SplittingIterator iterator( 173 Splitter splitter, CharSequence toSplit) { 174 return new SplittingIterator(splitter, toSplit) { 175 @Override public int separatorStart(int start) { 176 int separatorLength = separator.length(); 177 178 positions: 179 for (int p = start, last = toSplit.length() - separatorLength; 180 p <= last; p++) { 181 for (int i = 0; i < separatorLength; i++) { 182 if (toSplit.charAt(i + p) != separator.charAt(i)) { 183 continue positions; 184 } 185 } 186 return p; 187 } 188 return -1; 189 } 190 191 @Override public int separatorEnd(int separatorPosition) { 192 return separatorPosition + separator.length(); 193 } 194 }; 195 } 196 }); 197 } 198 199 /** 200 * Returns a splitter that divides strings into pieces of the given length. 201 * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an 202 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be 203 * smaller than {@code length} but will never be empty. 204 * 205 * <p><b>Exception:</b> for consistency with separator-based splitters, {@code 206 * split("")} does not yield an empty iterable, but an iterable containing 207 * {@code ""}. This is the only case in which {@code 208 * Iterables.size(split(input))} does not equal {@code 209 * IntMath.divide(input.length(), length, CEILING)}. To avoid this behavior, 210 * use {@code omitEmptyStrings}. 211 * 212 * @param length the desired length of pieces after splitting, a positive 213 * integer 214 * @return a splitter, with default settings, that can split into fixed sized 215 * pieces 216 * @throws IllegalArgumentException if {@code length} is zero or negative 217 */ 218 public static Splitter fixedLength(final int length) { 219 checkArgument(length > 0, "The length may not be less than 1"); 220 221 return new Splitter(new Strategy() { 222 @Override public SplittingIterator iterator( 223 final Splitter splitter, CharSequence toSplit) { 224 return new SplittingIterator(splitter, toSplit) { 225 @Override public int separatorStart(int start) { 226 int nextChunkStart = start + length; 227 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1); 228 } 229 230 @Override public int separatorEnd(int separatorPosition) { 231 return separatorPosition; 232 } 233 }; 234 } 235 }); 236 } 237 238 /** 239 * Returns a splitter that behaves equivalently to {@code this} splitter, but 240 * automatically omits empty strings from the results. For example, {@code 241 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an 242 * iterable containing only {@code ["a", "b", "c"]}. 243 * 244 * <p>If either {@code trimResults} option is also specified when creating a 245 * splitter, that splitter always trims results first before checking for 246 * emptiness. So, for example, {@code 247 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns 248 * an empty iterable. 249 * 250 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)} 251 * to return an empty iterable, but when using this option, it can (if the 252 * input sequence consists of nothing but separators). 253 * 254 * @return a splitter with the desired configuration 255 */ 256 @CheckReturnValue 257 public Splitter omitEmptyStrings() { 258 return new Splitter(strategy, true, trimmer, limit); 259 } 260 261 /** 262 * Returns a splitter that behaves equivalently to {@code this} splitter but 263 * stops splitting after it reaches the limit. 264 * The limit defines the maximum number of items returned by the iterator. 265 * 266 * <p>For example, 267 * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable 268 * containing {@code ["a", "b", "c,d"]}. When omitting empty strings, the 269 * omitted strings do no count. Hence, 270 * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")} 271 * returns an iterable containing {@code ["a", "b", "c,d"}. 272 * When trim is requested, all entries, including the last are trimmed. Hence 273 * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")} 274 * results in @{code ["a", "b", "c , d"]}. 275 * 276 * @param limit the maximum number of items returns 277 * @return a splitter with the desired configuration 278 * @since 9.0 279 */ 280 @CheckReturnValue 281 public Splitter limit(int limit) { 282 checkArgument(limit > 0, "must be greater than zero: %s", limit); 283 return new Splitter(strategy, omitEmptyStrings, trimmer, limit); 284 } 285 286 /** 287 * Returns a splitter that behaves equivalently to {@code this} splitter, but 288 * automatically removes leading and trailing {@linkplain 289 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent 290 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code 291 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable 292 * containing {@code ["a", "b", "c"]}. 293 * 294 * @return a splitter with the desired configuration 295 */ 296 @CheckReturnValue 297 public Splitter trimResults() { 298 return trimResults(CharMatcher.WHITESPACE); 299 } 300 301 /** 302 * Returns a splitter that behaves equivalently to {@code this} splitter, but 303 * removes all leading or trailing characters matching the given {@code 304 * CharMatcher} from each returned substring. For example, {@code 305 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")} 306 * returns an iterable containing {@code ["a ", "b_ ", "c"]}. 307 * 308 * @param trimmer a {@link CharMatcher} that determines whether a character 309 * should be removed from the beginning/end of a subsequence 310 * @return a splitter with the desired configuration 311 */ 312 // TODO(kevinb): throw if a trimmer was already specified! 313 @CheckReturnValue 314 public Splitter trimResults(CharMatcher trimmer) { 315 checkNotNull(trimmer); 316 return new Splitter(strategy, omitEmptyStrings, trimmer, limit); 317 } 318 319 /** 320 * Splits {@code sequence} into string components and makes them available 321 * through an {@link Iterator}, which may be lazily evaluated. If you want 322 * an eagerly computed {@link List}, use {@link #splitToList(CharSequence)}. 323 * 324 * @param sequence the sequence of characters to split 325 * @return an iteration over the segments split from the parameter. 326 */ 327 public Iterable<String> split(final CharSequence sequence) { 328 checkNotNull(sequence); 329 330 return new Iterable<String>() { 331 @Override public Iterator<String> iterator() { 332 return splittingIterator(sequence); 333 } 334 @Override public String toString() { 335 return Joiner.on(", ") 336 .appendTo(new StringBuilder().append('['), this) 337 .append(']') 338 .toString(); 339 } 340 }; 341 } 342 343 private Iterator<String> splittingIterator(CharSequence sequence) { 344 return strategy.iterator(this, sequence); 345 } 346 347 /** 348 * Splits {@code sequence} into string components and returns them as 349 * an immutable list. If you want an {@link Iterable} which may be lazily 350 * evaluated, use {@link #split(CharSequence)}. 351 * 352 * @param sequence the sequence of characters to split 353 * @return an immutable list of the segments split from the parameter 354 * @since 15.0 355 */ 356 @Beta 357 public List<String> splitToList(CharSequence sequence) { 358 checkNotNull(sequence); 359 360 Iterator<String> iterator = splittingIterator(sequence); 361 List<String> result = new ArrayList<String>(); 362 363 while (iterator.hasNext()) { 364 result.add(iterator.next()); 365 } 366 367 return Collections.unmodifiableList(result); 368 } 369 370 /** 371 * Returns a {@code MapSplitter} which splits entries based on this splitter, 372 * and splits entries into keys and values using the specified separator. 373 * 374 * @since 10.0 375 */ 376 @CheckReturnValue 377 @Beta 378 public MapSplitter withKeyValueSeparator(String separator) { 379 return withKeyValueSeparator(on(separator)); 380 } 381 382 /** 383 * Returns a {@code MapSplitter} which splits entries based on this splitter, 384 * and splits entries into keys and values using the specified separator. 385 * 386 * @since 14.0 387 */ 388 @CheckReturnValue 389 @Beta 390 public MapSplitter withKeyValueSeparator(char separator) { 391 return withKeyValueSeparator(on(separator)); 392 } 393 394 /** 395 * Returns a {@code MapSplitter} which splits entries based on this splitter, 396 * and splits entries into keys and values using the specified key-value 397 * splitter. 398 * 399 * @since 10.0 400 */ 401 @CheckReturnValue 402 @Beta 403 public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) { 404 return new MapSplitter(this, keyValueSplitter); 405 } 406 407 /** 408 * An object that splits strings into maps as {@code Splitter} splits 409 * iterables and lists. Like {@code Splitter}, it is thread-safe and 410 * immutable. 411 * 412 * @since 10.0 413 */ 414 @Beta 415 public static final class MapSplitter { 416 private static final String INVALID_ENTRY_MESSAGE = 417 "Chunk [%s] is not a valid entry"; 418 private final Splitter outerSplitter; 419 private final Splitter entrySplitter; 420 421 private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) { 422 this.outerSplitter = outerSplitter; // only "this" is passed 423 this.entrySplitter = checkNotNull(entrySplitter); 424 } 425 426 /** 427 * Splits {@code sequence} into substrings, splits each substring into 428 * an entry, and returns an unmodifiable map with each of the entries. For 429 * example, <code> 430 * Splitter.on(';').trimResults().withKeyValueSeparator("=>") 431 * .split("a=>b ; c=>b") 432 * </code> will return a mapping from {@code "a"} to {@code "b"} and 433 * {@code "c"} to {@code b}. 434 * 435 * <p>The returned map preserves the order of the entries from 436 * {@code sequence}. 437 * 438 * @throws IllegalArgumentException if the specified sequence does not split 439 * into valid map entries, or if there are duplicate keys 440 */ 441 public Map<String, String> split(CharSequence sequence) { 442 Map<String, String> map = new LinkedHashMap<String, String>(); 443 for (String entry : outerSplitter.split(sequence)) { 444 Iterator<String> entryFields = entrySplitter.splittingIterator(entry); 445 446 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry); 447 String key = entryFields.next(); 448 checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key); 449 450 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry); 451 String value = entryFields.next(); 452 map.put(key, value); 453 454 checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry); 455 } 456 return Collections.unmodifiableMap(map); 457 } 458 } 459 460 private interface Strategy { 461 Iterator<String> iterator(Splitter splitter, CharSequence toSplit); 462 } 463 464 private abstract static class SplittingIterator extends AbstractIterator<String> { 465 final CharSequence toSplit; 466 final CharMatcher trimmer; 467 final boolean omitEmptyStrings; 468 469 /** 470 * Returns the first index in {@code toSplit} at or after {@code start} 471 * that contains the separator. 472 */ 473 abstract int separatorStart(int start); 474 475 /** 476 * Returns the first index in {@code toSplit} after {@code 477 * separatorPosition} that does not contain a separator. This method is only 478 * invoked after a call to {@code separatorStart}. 479 */ 480 abstract int separatorEnd(int separatorPosition); 481 482 int offset = 0; 483 int limit; 484 485 protected SplittingIterator(Splitter splitter, CharSequence toSplit) { 486 this.trimmer = splitter.trimmer; 487 this.omitEmptyStrings = splitter.omitEmptyStrings; 488 this.limit = splitter.limit; 489 this.toSplit = toSplit; 490 } 491 492 @Override protected String computeNext() { 493 /* 494 * The returned string will be from the end of the last match to the 495 * beginning of the next one. nextStart is the start position of the 496 * returned substring, while offset is the place to start looking for a 497 * separator. 498 */ 499 int nextStart = offset; 500 while (offset != -1) { 501 int start = nextStart; 502 int end; 503 504 int separatorPosition = separatorStart(offset); 505 if (separatorPosition == -1) { 506 end = toSplit.length(); 507 offset = -1; 508 } else { 509 end = separatorPosition; 510 offset = separatorEnd(separatorPosition); 511 } 512 if (offset == nextStart) { 513 /* 514 * This occurs when some pattern has an empty match, even if it 515 * doesn't match the empty string -- for example, if it requires 516 * lookahead or the like. The offset must be increased to look for 517 * separators beyond this point, without changing the start position 518 * of the next returned substring -- so nextStart stays the same. 519 */ 520 offset++; 521 if (offset >= toSplit.length()) { 522 offset = -1; 523 } 524 continue; 525 } 526 527 while (start < end && trimmer.matches(toSplit.charAt(start))) { 528 start++; 529 } 530 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 531 end--; 532 } 533 534 if (omitEmptyStrings && start == end) { 535 // Don't include the (unused) separator in next split string. 536 nextStart = offset; 537 continue; 538 } 539 540 if (limit == 1) { 541 // The limit has been reached, return the rest of the string as the 542 // final item. This is tested after empty string removal so that 543 // empty strings do not count towards the limit. 544 end = toSplit.length(); 545 offset = -1; 546 // Since we may have changed the end, we need to trim it again. 547 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 548 end--; 549 } 550 } else { 551 limit--; 552 } 553 554 return toSplit.subSequence(start, end).toString(); 555 } 556 return endOfData(); 557 } 558 } 559 } 560